This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
# Install tidyverse only when it is missing, so re-running the notebook does
# not download and reinstall the package on every execution.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
also installing the dependencies ‘DBI’, ‘dbplyr’, ‘modelr’, ‘reprex’
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.5/DBI_1.0.0.tgz'
Content type 'application/x-gzip' length 876902 bytes (856 KB)
==================================================
downloaded 856 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.5/dbplyr_1.4.2.tgz'
Content type 'application/x-gzip' length 569902 bytes (556 KB)
==================================================
downloaded 556 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.5/modelr_0.1.5.tgz'
Content type 'application/x-gzip' length 198251 bytes (193 KB)
==================================================
downloaded 193 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.5/reprex_0.3.0.tgz'
Content type 'application/x-gzip' length 423802 bytes (413 KB)
==================================================
downloaded 413 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.5/tidyverse_1.2.1.tgz'
Content type 'application/x-gzip' length 88754 bytes (86 KB)
==================================================
downloaded 86 KB
The downloaded binary packages are in
/var/folders/jb/5s85b1c570b6rhdk3p3yr6xr0000gn/T//RtmpQVYylN/downloaded_packages
# Packages for plotting (ggplot2), data manipulation (dplyr), the %>% pipe
# (magrittr), multi-plot layout (gridExtra) and colour palettes (RColorBrewer).
library(ggplot2)
library(dplyr)
library(gridExtra)
library(magrittr)
library(RColorBrewer)
# NOTE(review): this attach() appears before the read.csv() call that assigns
# Housesale further down, so it only works when Housesale already exists in
# the session. Bare column names resolve through this attached copy wherever
# code does not go through Housesale$ or a data = argument; columns added to
# Housesale afterwards are not visible through it.
attach(Housesale)
The following objects are masked from Housesale (pos = 3):
bathrooms, bedrooms, condition, date, floors, grade, id, isreno, lat, long, price, sqft_above,
sqft_basement, sqft_living, sqft_living15, sqft_lot, sqft_lot15, view, waterfront, yr_built,
yr_renovated, zipcode
The following objects are masked from Housesale (pos = 5):
bathrooms, bedrooms, condition, date, floors, grade, id, isreno, lat, long, price, sqft_above,
sqft_basement, sqft_living, sqft_living15, sqft_lot, sqft_lot15, view, waterfront, yr_built,
yr_renovated, zipcode
The following objects are masked from Housesale (pos = 7):
bathrooms, bedrooms, condition, date, floors, grade, id, isreno, lat, long, price, sqft_above,
sqft_basement, sqft_living, sqft_living15, sqft_lot, sqft_lot15, view, waterfront, yr_built,
yr_renovated, zipcode
The following objects are masked from Housesale (pos = 11):
bathrooms, bedrooms, condition, date, floors, grade, id, lat, long, price, sqft_above,
sqft_basement, sqft_living, sqft_living15, sqft_lot, sqft_lot15, view, waterfront, yr_built,
yr_renovated, zipcode
library(readr)
library(caret)
library(corrplot)
library(caTools)
library(tidyverse)
[30m── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mtibble [30m 2.1.3 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtidyr [30m 0.8.3 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mtibble [30m 2.1.3 [32m✔[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mgridExtra[30m::[32mcombine()[30m masks [34mdplyr[30m::combine()
[31m✖[30m [34mtidyr[30m::[32mextract()[30m masks [34mmagrittr[30m::extract()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mpurrr[30m::[32mlift()[30m masks [34mcaret[30m::lift()
[31m✖[30m [34mpurrr[30m::[32mset_names()[30m masks [34mmagrittr[30m::set_names()[39m
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:gridExtra’:
combine
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
## Importing data from CSV and summary
getwd()
[1] "/Users/saileshraturi/Documents/GitHub/Regression-House-Sale"
# Import the house-sale data from kc_house_data.csv.
# The file is read relative to the current working directory instead of
# calling setwd() with a machine-specific absolute path, which fails on any
# other computer.
data_file <- "kc_house_data.csv"
if (!file.exists(data_file)) {
  stop(
    "Cannot find '", data_file, "' in ", getwd(),
    "; run the notebook from the project directory.",
    call. = FALSE
  )
}
Housesale <- read.csv(data_file)
# Print the data, its first rows and its column structure.
Housesale
head(Housesale)
str(Housesale)
'data.frame': 21613 obs. of 21 variables:
$ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
$ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
$ price : num 221900 538000 180000 604000 510000 ...
$ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
$ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
$ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
$ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
$ floors : num 1 2 1 1 1 1 2 1 1 2 ...
$ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
$ view : int 0 0 0 0 0 0 0 0 0 0 ...
$ condition : int 3 3 3 5 3 3 3 3 3 3 ...
$ grade : int 7 7 6 7 8 11 7 7 7 7 ...
$ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
$ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
$ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
$ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
$ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
$ lat : num 47.5 47.7 47.7 47.5 47.6 ...
$ long : num -122 -122 -122 -122 -122 ...
$ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
$ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
summary(Housesale)
id date price bedrooms bathrooms sqft_living
Min. :1.000e+06 20140623T000000: 142 Min. : 75000 Min. : 0.000 Min. :0.000 Min. : 290
1st Qu.:2.123e+09 20140625T000000: 131 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427
Median :3.905e+09 20140626T000000: 131 Median : 450000 Median : 3.000 Median :2.250 Median : 1910
Mean :4.580e+09 20140708T000000: 127 Mean : 540088 Mean : 3.371 Mean :2.115 Mean : 2080
3rd Qu.:7.309e+09 20150427T000000: 126 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550
Max. :9.900e+09 20150325T000000: 123 Max. :7700000 Max. :33.000 Max. :8.000 Max. :13540
(Other) :20833
sqft_lot floors waterfront view condition grade
Min. : 520 Min. :1.000 Min. :0.000000 Min. :0.0000 Min. :1.000 Min. : 1.000
1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 7.000
Median : 7618 Median :1.500 Median :0.000000 Median :0.0000 Median :3.000 Median : 7.000
Mean : 15107 Mean :1.494 Mean :0.007542 Mean :0.2343 Mean :3.409 Mean : 7.657
3rd Qu.: 10688 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 8.000
Max. :1651359 Max. :3.500 Max. :1.000000 Max. :4.0000 Max. :5.000 Max. :13.000
sqft_above sqft_basement yr_built yr_renovated zipcode lat
Min. : 290 Min. : 0.0 Min. :1900 Min. : 0.0 Min. :98001 Min. :47.16
1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47
Median :1560 Median : 0.0 Median :1975 Median : 0.0 Median :98065 Median :47.57
Mean :1788 Mean : 291.5 Mean :1971 Mean : 84.4 Mean :98078 Mean :47.56
3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68
Max. :9410 Max. :4820.0 Max. :2015 Max. :2015.0 Max. :98199 Max. :47.78
long sqft_living15 sqft_lot15
Min. :-122.5 Min. : 399 Min. : 651
1st Qu.:-122.3 1st Qu.:1490 1st Qu.: 5100
Median :-122.2 Median :1840 Median : 7620
Mean :-122.2 Mean :1987 Mean : 12768
3rd Qu.:-122.1 3rd Qu.:2360 3rd Qu.: 10083
Max. :-121.3 Max. :6210 Max. :871200
# Base-10 log transforms of living area and price.
# Columns are read through Housesale$ so the code does not depend on the
# data frame having been attach()ed.
log_sqftliving <- log10(Housesale$sqft_living)
Housesale <- Housesale %>% mutate(log_price = log10(price))
# Smallest and largest living area (sqft) and the width of that range.
x <- min(Housesale$sqft_living)
y <- max(Housesale$sqft_living)
z <- (y - x)
# Taking the antilog: the transforms above use log10(), so the inverse is
# 10^value, not exp(value).
l1 <- 10^2.9
l1
[1] 794.3282
l2 <- 10^3.6
l2
[1] 3981.072
# Histogram of log10(price).
g1 <- ggplot(Housesale, aes(x = log_price)) +
  geom_histogram(fill = "red", binwidth = 0.10)
# Histogram of log10(sqft_living).
g2 <- ggplot(Housesale, aes(x = log_sqftliving)) +
  geom_histogram(fill = "blue", binwidth = 0.20)
# grid.arrange(g1,g2,nrow = 1,ncol = 2)
# House price: most log10(price) values lie between 5.4 and 6, i.e. roughly
# $250,000 to $1,000,000 (10^5.4 to 10^6) -- these are log units, not millions.
# Sqft living: most log10(sqft_living) values lie between about 2.9 and 3.6,
# i.e. roughly 800 to 4,000 sqft (10^2.9 to 10^3.6).
# NOTE(review): an earlier reading of "1800 to 3600 sqft" appears to come from
# exp(2.9) and exp(3.6); confirm the range against the plot.
# Histogram of the number of bathrooms, with the x axis limited to 1-8.
ggplot(Housesale, aes(x = bathrooms)) +
  geom_histogram(fill = "tomato", binwidth = 0.5) +
  scale_x_continuous(limits = c(1, 8))
# Maximum, minimum and median absolute deviation of the bathroom counts.
# The column is read through Housesale$ rather than relying on attach().
x <- max(Housesale$bathrooms)
y <- min(Housesale$bathrooms)
z <- mad(Housesale$bathrooms)
# House price vs size.
# Jitter is used to reduce overlapping points.
# labs() needs a named argument: labs("title = ...") passes one unnamed string
# and never sets the title. `span` was dropped because it only applies to
# loess smoothing, not to method = "lm".
g3 <- ggplot(Housesale, aes(x = log_sqftliving, y = log_price)) +
  geom_jitter(alpha = 0.5, size = 2, color = "brown") +
  stat_smooth(method = "lm", se = FALSE) +
  labs(title = "sqftliving vs Price")
# House price vs bedrooms.
# Colour ramp built from two RColorBrewer palettes (8 + 6 colours).
mycolors <- c(
  brewer.pal(name = "Dark2", n = 8),
  brewer.pal(name = "Paired", n = 6)
)
#Housesale = Housesale %>% filter(bedrooms < 30)
#Housesale
g4 <- ggplot(Housesale, aes(x = bedrooms, y = log_price, col = bedrooms)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_gradientn(colors = mycolors)
# Show both scatter plots side by side.
grid.arrange(g3, g4, nrow = 1, ncol = 2)
#ggplot(aes(Housesale,x=bedrooms,y=log_price))+
#geom_point(alpha=0.5,size=2)+
#geom_smooth(method="lm",se=F)+
#labs("title=Bedrooms vs Price")+scale_color_gradientn(colors=mycolors)+theme(legend.position="none")
# log10(price) against basement area, with a linear fit.
g5 <- ggplot(Housesale, aes(x = sqft_basement, y = log_price)) +
  geom_point(col = "green", alpha = 0.5) +
  stat_smooth(method = "lm", se = FALSE, alpha = 0.6, size = 0.5)
# log10(price) against construction year, with an automatically chosen
# smoother and its confidence band.
g6 <- ggplot(Housesale, aes(x = yr_built, y = log_price)) +
  geom_jitter(col = "blue", alpha = 0.5) +
  geom_smooth(method = "auto", se = TRUE)
# Show both plots side by side.
grid.arrange(g5, g6, nrow = 1, ncol = 2)
# Count of houses per condition level; with() looks the column up in
# Housesale directly instead of relying on attach().
with(Housesale, table(condition))
condition
1 2 3 4 5
30 172 14031 5679 1701
# Mean, standard deviation and count of log10(price) per condition level.
Housesale %>%
  group_by(factor(condition)) %>%
  summarise(
    mean_price = mean(log_price),
    sd = sd(log_price),
    count = n()
  )
# Distribution of house price according to the condition of the house.
ggplot(
  Housesale,
  aes(x = factor(condition), y = log_price, fill = factor(condition))
) +
  geom_boxplot()
# Relationship between size, price and condition (one panel per condition).
ggplot(
  Housesale,
  aes(x = log_sqftliving, y = log_price, color = factor(condition))
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  facet_wrap(~condition)
#grid.arrange(g7, g8, nrow = 1, ncol = 2)
# Count of houses per number of floors; with() looks the column up in
# Housesale directly instead of relying on attach().
with(Housesale, table(floors))
# Bar chart of the same counts.
Housesale %>%
  group_by(flr = factor(floors)) %>%
  summarise(floor_cnt = n()) %>%
  ggplot(aes(x = flr, floor_cnt, fill = flr)) +
  geom_bar(stat = "identity")
#hist(floors)
# Distribution of log10(price) per number of floors.
ggplot(
  Housesale,
  aes(x = factor(floors), y = (log_price), fill = factor(floors))
) +
  geom_boxplot()
#ggplot(Housesale, aes(x = yr_built, y = log_price)) + geom_point()
#+
# Houses built per 5-year bin.
Housesale %>%
  ggplot(aes(x = yr_built)) +
  geom_histogram(binwidth = 5, fill = rainbow(1), alpha = 0.5) +
  scale_x_continuous(limits = c(1900, 2016))
# Year built vs size of house (log10 sqft), one box per year.
options(repr.plot.width = 10, repr.plot.height = 6)
ggplot(
  Housesale,
  aes(x = factor(yr_built), y = log_sqftliving, fill = factor(yr_built))
) +
  geom_boxplot() +
  theme(legend.position = "none")
# The point colour is a fixed value, so it is set on the geom. Inside aes()
# the string "green" is treated as a one-level variable: the points get the
# default palette colour and a spurious legend appears.
ggplot(Housesale, aes(x = yr_built, y = log_sqftliving)) +
  geom_jitter(color = "green", alpha = 0.5, size = 0.5) +
  stat_smooth(method = "auto", color = "black")
# trend of increase in sqft living 1950 onwards till 1990
# House view (waterfront flag).
table(Housesale$waterfront)
# Logical flag for waterfront houses. The comparison already yields
# TRUE/FALSE, and the column is read from Housesale instead of relying on
# attach().
Housesale$houseview <- Housesale$waterfront == 1
#ggplot(Housesale, aes(x = houseview, y = log_price, fill = factor(waterfront))) + geom_boxplot()
# Most of the houses do not have a waterfront, while houses with a waterfront
# are more expensive.
Housesale %>%
  group_by(houseview) %>%
  summarise(meanprice = mean(log_price), housecount = n())
# Size vs price, one panel per waterfront flag, with a linear fit.
ggplot(Housesale, aes(x = log_sqftliving, y = log_price, col = houseview)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 0.5, color = "black") +
  scale_color_manual(values = rainbow(n = 12)) +
  facet_wrap(~houseview)
# sold houses which have waterfront are expensive and high sqftliving but less in number
# Count of houses per grade; with() looks the column up in Housesale directly
# instead of relying on attach().
with(Housesale, table(grade))
#grade vs price
#ggplot(Housesale, aes(x = factor(grade), y = log_price, fill = factor(grade))) + geom_boxplot(alpha = 0.5)
# Grade vs sqftliving vs price: one panel per grade with a linear fit.
ggplot(
  Housesale,
  aes(x = log_sqftliving, y = log_price, color = factor(grade))
) +
  geom_point(alpha = 0.5) +
  facet_wrap(~grade) +
  geom_smooth(method = "lm", color = "black") +
  scale_color_manual(values = rainbow(n = 12)) +
  theme(legend.position = "none")
# Renovation year (0 is used where no renovation year is recorded).
table(Housesale$yr_renovated)
# Logical flag for renovated houses. The comparison already yields
# TRUE/FALSE, and the column is read from Housesale instead of relying on
# attach().
Housesale$isreno <- Housesale$yr_renovated != 0
table(Housesale$isreno)
# Histogram of renovation year; the axis limits drop the 0 entries.
ggplot(Housesale, aes(yr_renovated)) +
  geom_histogram(alpha = 0.5, binwidth = 1, fill = rainbow(1)) +
  scale_x_continuous(limits = c(1900, 2016))
#ggplot(Housesale, aes(x = factor(yr_renovated), y= log_price, fill = factor(yr_renovated))) + geom_boxplot()
# Year built vs price, coloured by renovation year.
ggplot(Housesale, aes(x = yr_built, y = log_price, col = yr_renovated)) +
  geom_jitter(alpha = 0.5)
# Renovation year vs year built, one panel per renovation flag.
ggplot(Housesale, aes(x = yr_built, y = yr_renovated, color = isreno)) +
  geom_jitter(alpha = 0.5) +
  facet_wrap(~isreno)
# Splitting the data into train (70%) and test (30%) subsets.
# sample.split() expects a vector with one entry per observation. Passing the
# whole data frame yields a mask with one entry per COLUMN, which subset()
# then recycles down the rows, giving a fixed repeating pattern instead of a
# random 70/30 split. The row index is passed so the mask has nrow(Housesale)
# entries.
# NOTE: outputs recorded from earlier runs (row counts, model summaries, RMSE)
# change once the notebook is re-run with this split.
set.seed(0512)
is_train <- sample.split(seq_len(nrow(Housesale)), SplitRatio = 0.70)
trainhs <- subset(Housesale, is_train)
tesths <- subset(Housesale, !is_train)
nrow(Housesale)
[1] 21613
ncol(Housesale)
[1] 22
nrow(trainhs)
[1] 14736
nrow(tesths)
[1] 6877
# Variable significance: pairwise correlations of columns 3 to 21 (price and
# the numeric house attributes), drawn as a correlation plot.
corr <- Housesale[, 3:21] %>% cor()
corrplot(corr)
NA
NA
NA
# Linear regression on log(price), predictors chosen from the correlation
# matrix.
modellm1 <- lm(
  log(price) ~ bedrooms + bathrooms + sqft_living + waterfront + view +
    condition + grade + yr_built + yr_renovated + zipcode + lat +
    sqft_living15 + sqft_lot15,
  data = trainhs
)
summary(modellm1)
#plot(modellm1)
# Linear regression on log(price) using all original predictors.
# Columns 3:21 are used instead of 3:22: column 22 is log_price, a direct
# transform of the response, and `~ .` would otherwise pull it in as a
# predictor and leak the target into the model.
modellm2 <- lm(log(price) ~ ., data = trainhs[, 3:21])
summary(modellm2)
#plot(modellm2)
# Linear regression with k-fold cross-validation: the data is partitioned into
# random folds, each fold is held out in turn, and the accuracy metrics are
# averaged to estimate how the model performs on unseen data.
#modellm3 = train(log(price) ~ .,data = trainhs[,3:22], method = "lm", trControl = trainControl(method = "cv",number = 10, savePredictions = TRUE))
# Install ggfortify only when it is missing.
if (!requireNamespace("ggfortify", quietly = TRUE)) {
  install.packages("ggfortify")
}
# Fit on the training rows and on columns 3:21 only. With `price ~ .` the full
# Housesale frame would also feed in id, the date factor, the derived
# log_price column (a transform of the response) and the test rows.
modellm3 <- train(
  price ~ .,
  data = trainhs[, 3:21],
  method = "lm",
  trControl = trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
summary(modellm3)
library(ggfortify)
# Diagnostic plots need the underlying lm fit; a caret `train` object for
# method = "lm" has no tuning parameters to plot.
autoplot(modellm3$finalModel, ncol = 2)
# 10-fold cross-validated linear regression on log(price) with linear and
# squared terms; the per-fold predictions are kept (savePredictions = TRUE).
modellm6 <- train(
  log(price) ~ bedrooms + bathrooms + sqft_living + waterfront + view +
    condition + grade + yr_built + yr_renovated + lat + sqft_living15 +
    sqft_lot15 + I(bedrooms^2) + I(sqft_living^2) + I(view^2) + I(grade^2) +
    I(yr_built^2) + I(yr_renovated^2) + I(lat^2) + I(sqft_living15^2) +
    I(sqft_lot15^2),
  data = trainhs[, 3:22],
  method = "lm",
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE
  )
)
summary(modellm6)
#parametergrid <- expand.grid(c(5,6,7,8))
#summary(modellm3)
# Residuals of modellm1 plotted against each predictor, to look for
# non-linear patterns. Every plot uses the same two layers (points plus a
# smoother), kept in one list that is added to each ggplot.
resid_layers <- list(geom_point(), geom_smooth())
g1 <- ggplot(trainhs, aes(bathrooms, residuals(modellm1))) + resid_layers
g2 <- ggplot(trainhs, aes(sqft_living, residuals(modellm1))) + resid_layers
g3 <- ggplot(trainhs, aes(view, residuals(modellm1))) + resid_layers
g4 <- ggplot(trainhs, aes(grade, residuals(modellm1))) + resid_layers
g5 <- ggplot(trainhs, aes(yr_built, residuals(modellm1))) + resid_layers
g6 <- ggplot(trainhs, aes(sqft_living15, residuals(modellm1))) + resid_layers
g7 <- ggplot(trainhs, aes(bedrooms, residuals(modellm1))) + resid_layers
g8 <- ggplot(trainhs, aes(waterfront, residuals(modellm1))) + resid_layers
g9 <- ggplot(trainhs, aes(lat, residuals(modellm1))) + resid_layers
g10 <- ggplot(trainhs, aes(sqft_lot15, residuals(modellm1))) + resid_layers
g11 <- ggplot(trainhs, aes(condition, residuals(modellm1))) + resid_layers
g12 <- ggplot(trainhs, aes(yr_renovated, residuals(modellm1))) + resid_layers
g13 <- ggplot(trainhs, aes(zipcode, residuals(modellm1))) + resid_layers
# Lay the plots out in three rows of panels.
grid.arrange(g1, g2, g3, g4, nrow = 1, ncol = 4)
grid.arrange(g5, g6, g7, g8, nrow = 1, ncol = 4)
grid.arrange(g9, g10, g11, g12, g13, nrow = 1, ncol = 5)
# Model with squared terms added for the predictors, based on the residual
# plots.
#modellm4 = lm(log(price) ~ bedrooms + bathrooms + sqft_living + waterfront + view + condition + grade + yr_built + yr_renovated + zipcode + lat + sqft_living15 + sqft_lot15 + I(bedrooms^2) + I(bathrooms^2) + I(sqft_living^2)+ I(waterfront^2) + I(view^2) + I(condition^2) + I(grade^2) + I(yr_built^2) + I(yr_renovated^2) +I(zipcode^2) + I(lat^2) + I(sqft_living15^2)+ I(sqft_lot15^2) , data = trainhs)
#summary(modellm4)
#plot(modellm4)
modellm4 <- lm(
  log(price) ~ bedrooms + bathrooms + sqft_living + waterfront + view +
    condition + grade + yr_built + yr_renovated + zipcode + lat +
    sqft_living15 + sqft_lot15 + I(bedrooms^2) + I(bathrooms^2) +
    I(sqft_living^2) + I(grade^2) + I(yr_built^2) + I(yr_renovated^2) +
    I(zipcode^2) + I(lat^2) + I(sqft_living15^2) + I(sqft_lot15^2),
  data = trainhs
)
summary(modellm4)
# Model refitted after removing the less significant terms identified in
# modellm4.
#modellm5 = update(modellm4, ~.-zipcode-I(bathrooms^2)-I(waterfront^2)-I(condition^2)-I(zipcode^2))
modellm5 <- lm(
  log(price) ~ bedrooms + bathrooms + sqft_living + waterfront + view +
    condition + grade + yr_built + yr_renovated + lat + sqft_living15 +
    sqft_lot15 + I(bedrooms^2) + I(sqft_living^2) + I(view^2) + I(grade^2) +
    I(yr_built^2) + I(yr_renovated^2) + I(lat^2) + I(sqft_living15^2) +
    I(sqft_lot15^2),
  data = trainhs
)
summary(modellm5)
Call:
lm(formula = log(price) ~ bedrooms + bathrooms + sqft_living +
waterfront + view + condition + grade + yr_built + yr_renovated +
lat + sqft_living15 + sqft_lot15 + I(bedrooms^2) + I(sqft_living^2) +
I(view^2) + I(grade^2) + I(yr_built^2) + I(yr_renovated^2) +
I(lat^2) + I(sqft_living15^2) + I(sqft_lot15^2), data = trainhs)
Residuals:
Min 1Q Median 3Q Max
-1.25879 -0.14747 -0.00418 0.14024 1.14221
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.067e+03 2.283e+02 -30.948 < 2e-16 ***
bedrooms -2.185e-02 4.213e-03 -5.187 2.16e-07 ***
bathrooms 5.025e-02 4.649e-03 10.809 < 2e-16 ***
sqft_living 2.146e-04 9.006e-06 23.835 < 2e-16 ***
waterfront 4.151e-01 2.783e-02 14.918 < 2e-16 ***
view 9.124e-02 9.776e-03 9.333 < 2e-16 ***
condition 7.457e-02 3.418e-03 21.817 < 2e-16 ***
grade 1.809e-01 1.776e-02 10.188 < 2e-16 ***
yr_built -2.041e-01 9.199e-03 -22.188 < 2e-16 ***
yr_renovated -5.557e-03 6.422e-04 -8.653 < 2e-16 ***
lat 3.050e+02 9.574e+00 31.857 < 2e-16 ***
sqft_living15 2.564e-04 1.569e-05 16.339 < 2e-16 ***
sqft_lot15 8.378e-07 1.205e-07 6.955 3.68e-12 ***
I(bedrooms^2) 8.598e-04 2.902e-04 2.962 0.003057 **
I(sqft_living^2) -9.959e-09 1.247e-09 -7.985 1.51e-15 ***
I(view^2) -9.144e-03 3.168e-03 -2.886 0.003904 **
I(grade^2) -1.508e-03 1.107e-03 -1.363 0.172994
I(yr_built^2) 5.134e-05 2.347e-06 21.880 < 2e-16 ***
I(yr_renovated^2) 2.806e-06 3.218e-07 8.720 < 2e-16 ***
I(lat^2) -3.194e+00 1.007e-01 -31.714 < 2e-16 ***
I(sqft_living15^2) -3.648e-08 3.193e-09 -11.426 < 2e-16 ***
I(sqft_lot15^2) -1.053e-12 2.992e-13 -3.520 0.000433 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.239 on 14714 degrees of freedom
Multiple R-squared: 0.7926, Adjusted R-squared: 0.7923
F-statistic: 2677 on 21 and 14714 DF, p-value: < 2.2e-16
plot(modellm5)
NA
# RMSE of the linear regression model on the test set.
# modellm5 (fitted above) is evaluated here. modellm7 is only created further
# down, so referencing it at this point stops a top-to-bottom run with
# "object 'modellm7' not found".
testmodel <- predict(modellm5, tesths[, 3:21])
# The model predicts log(price); exp() maps predictions back to prices.
plot(exp(testmodel) ~ tesths$price)
#plot(testmodel ~ tesths$price)
# Reference line y = x: perfect predictions would fall on it.
abline(a = 0, b = 1)
res1 <- exp(testmodel) - tesths$price
#res1 = testmodel - tesths$price
#rmse <- sqrt(sum((exp(testmodel) - tesths$price)^2)/length(tesths$price))
rmse <- sqrt(mean(res1^2))
rmse
[1] 199403.9
# Random forest on price using the original predictors (columns 3:21).
modelrf <- randomForest(
  price ~ .,
  trainhs[, 3:21],
  mtry = 6,
  importance = TRUE
)
# print() shows the fit summary of the forest; summary() on a randomForest
# object only lists the lengths and classes of its components.
print(modelrf)
importance(modelrf)
varImpPlot(modelrf, type = 2)
# Names of the predictors used in modellm7 below (presumably the top
# variables from the importance output -- confirm). Stored as names: adding
# the bare columns with `+` summed the column vectors element-wise and
# depended on attach().
para <- c(
  "sqft_living", "grade", "lat", "sqft_living15", "sqft_above",
  "long", "bathrooms", "yr_built", "view"
)
# Linear model on log(price) restricted to those predictors.
modellm7 <- lm(
  log(price) ~ sqft_living + grade + lat + sqft_living15 + sqft_above +
    long + bathrooms + yr_built + view,
  data = trainhs[, 3:21]
)
summary(modellm7)
Call:
lm(formula = log(price) ~ sqft_living + grade + lat + sqft_living15 +
sqft_above + long + bathrooms + yr_built + view, data = trainhs[,
3:21])
Residuals:
Min 1Q Median 3Q Max
-1.79935 -0.16215 0.00189 0.16113 1.24437
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.151e+01 2.327e+00 -22.133 < 2e-16 ***
sqft_living 1.274e-04 6.197e-06 20.553 < 2e-16 ***
grade 1.681e-01 3.300e-03 50.925 < 2e-16 ***
lat 1.333e+00 1.596e-02 83.499 < 2e-16 ***
sqft_living15 9.225e-05 5.284e-06 17.459 < 2e-16 ***
sqft_above 1.332e-05 6.033e-06 2.208 0.02729 *
long -5.337e-02 1.777e-02 -3.003 0.00268 **
bathrooms 8.879e-02 4.723e-03 18.800 < 2e-16 ***
yr_built -3.712e-03 9.879e-05 -37.572 < 2e-16 ***
view 7.622e-02 3.037e-03 25.095 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2587 on 14726 degrees of freedom
Multiple R-squared: 0.7569, Adjusted R-squared: 0.7567
F-statistic: 5093 on 9 and 14726 DF, p-value: < 2.2e-16
# RMSE of the random forest on the test set. The forest predicts price
# directly, so no back-transformation is needed.
testmodel_rf <- predict(modelrf, tesths)
#p1 = plot(tesths$price ~ exp(testmodel))
#abline(a =0, b =1)
#p2 = plot(tesths$price ~ testmodel_rf)
#abline(a =0, b =1)
#grid.arrange(p1,p2, nrow = 1, ncol = 2)
# Prediction errors, then the root of their mean square.
res1 <- testmodel_rf - tesths$price
rmse <- (res1^2) %>% mean() %>% sqrt()
rmse
[1] 135938.7